import numpy as np
import pandas as pd
import pandas_profiling as pf
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
# Read the credit-card / personal-loan dataset into the DataFrame ccData.
ccData = pd.read_csv("Bank_Personal_Loan_Modelling.csv")
# Data types of each column
ccData.dtypes
# Statistical summary of the numeric columns
ccData.describe().transpose()
# Shape of the data (rows, columns)
ccData.shape
# Check for null values
ccData.isnull().sum()
# Check for NA values
ccData.isna().sum()
# Eyeball the first rows for incorrect imputation
ccData.head()
# Number of unique values in each column
ccData.nunique()
# Number of unique values in a particular column, e.g. Age
ccData.Age.nunique()
# How many customers have no mortgage at all
(ccData['Mortgage']==0).sum()
ccData.shape
# How many customers have zero average credit-card spend
(ccData['CCAvg']==0).sum()
ccData['Family'].value_counts()
ccData['Education'].value_counts()
# Overall summary of the data using pandas-profiling's profile_report
ccData.profile_report()
# Histograms of every column (fixed typo: colums -> columns)
columns = ccData.columns
ccData[columns].hist(stacked=False, bins=10, figsize=(12,30), layout=(14,1));
# Univariate distribution / outlier checks
sns.distplot(ccData['Income'])
sns.boxplot(ccData['Experience'])
sns.distplot(ccData['Age'])
sns.distplot(ccData['CD Account'])
# Income by education level, split by Personal Loan status
sns.boxplot(x='Education',y='Income',hue='Personal Loan',data=ccData)
1. Customers with education level 1 are the highest earners.
2. Customers who have availed a personal loan have similar incomes regardless of education, which suggests education level does not drive the decision to avail a personal loan.
# Mortgage by education level, split by Personal Loan status
sns.boxplot(x="Education", y='Mortgage', hue="Personal Loan", data=ccData)
Customers with higher mortgages are more likely to avail personal loans.
# Mortgage by family size, split by Personal Loan status
sns.boxplot(x="Family", y='Mortgage', hue="Personal Loan", data=ccData)
It looks like families of any size will go for personal loans.
# Mean family size of customers WITHOUT a personal loan
familysize_no = np.mean( ccData[ccData['Personal Loan'] == 0]['Family'] )
familysize_no
# Mean family size of customers WITH a personal loan.
# Fixed: this was previously reassigned to familysize_no, silently
# clobbering the first result before it could be compared.
familysize_yes = np.mean( ccData[ccData['Personal Loan'] == 1]['Family'] )
familysize_yes
This means family size does not have much impact on the decision to take a loan.
# CCAvg vs CD Account, coloured by Personal Loan status
sns.scatterplot(x="CCAvg", y='CD Account', hue="Personal Loan", data=ccData);
# Count plots of spend / account / card features, split by loan status
sns.countplot(x="CCAvg", data=ccData,hue="Personal Loan")
sns.countplot(x="CD Account", data=ccData,hue="Personal Loan")
sns.countplot(x="CreditCard", data=ccData,hue="Personal Loan");
# CCAvg distribution: non-borrowers (yellow) vs borrowers (green)
sns.distplot(ccData[ccData['Personal Loan']==0]['CCAvg'], color='y')
sns.distplot(ccData[ccData['Personal Loan']==1]['CCAvg'], color='g')
Customers who have a personal loan also have higher average credit-card expenditure.
# Income distribution: non-borrowers (yellow) vs borrowers (green)
sns.distplot(ccData[ccData['Personal Loan']==0]['Income'], color='y')
sns.distplot(ccData[ccData['Personal Loan']==1]['Income'], color='g')
Higher income means a lower probability of buying a personal loan.
# Education-level distribution: non-borrowers (yellow) vs borrowers (green)
sns.distplot(ccData[ccData['Personal Loan']==0]['Education'], color='y')
sns.distplot(ccData[ccData['Personal Loan']==1]['Education'], color='g')
A higher education level means a higher probability of buying a personal loan.
# Mortgage distribution: non-borrowers (yellow) vs borrowers (green)
sns.distplot(ccData[ccData['Personal Loan']==0]['Mortgage'], color='y')
sns.distplot(ccData[ccData['Personal Loan']==1]['Mortgage'], color='g')
ccData.columns
# Same split-by-target distribution check for the remaining features
sns.distplot(ccData[ccData['Personal Loan']==0]['CreditCard'], color='y')
sns.distplot(ccData[ccData['Personal Loan']==1]['CreditCard'], color='g')
sns.distplot(ccData[ccData['Personal Loan']==0]['Experience'], color='y')
sns.distplot(ccData[ccData['Personal Loan']==1]['Experience'], color='g')
sns.distplot(ccData[ccData['Personal Loan']==0]['CD Account'], color='y')
sns.distplot(ccData[ccData['Personal Loan']==1]['CD Account'], color='g')
sns.distplot(ccData[ccData['Personal Loan']==0]['Online'], color='y')
sns.distplot(ccData[ccData['Personal Loan']==1]['Online'], color='g')
# Pairwise correlation heatmap (lower triangle only)
corr = ccData.corr()
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# create a mask so we only see the correlation values once
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr,mask=mask, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)
# Look at the mean of each attribute per target class
ccData.groupby(ccData['Personal Loan']).mean()
# Drop ID and ZIP Code (identifiers, no predictive signal) and
# Experience (contains some negative values).
# NOTE: inplace drops — cells above this point cannot be re-run afterwards.
ccData.drop(columns ='ID',inplace=True)
ccData.drop(columns ='Experience',inplace= True)
ccData.drop(columns ='ZIP Code',inplace= True)
ccData.head()
# Class balance of the target, as proportions
ccData['Personal Loan'].value_counts(normalize=True)
# The target is highly imbalanced (classes are not equally weighted),
# which means we cannot rely on the accuracy score alone.
# Independent variables: everything except the target.
x = ccData.drop('Personal Loan', axis=1)
# Dependent / target variable
y = ccData['Personal Loan']
# stratify=y keeps the rare positive class at the same proportion in both
# splits — important given the imbalance noted above.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1, stratify=y)
ccData.shape
x_train.shape
x_test.shape
x_train.describe().transpose()
x_test.describe().transpose()
# Statistical inference with statsmodels: fit a logit, inspect coefficients.
import statsmodels.api as sm
logit = sm.Logit(y_train, sm.add_constant(x_train))
lg = logit.fit()
lg.summary()
# Calculate odds ratio and probability from the fitted coefficients
lg.params
lgcoef = pd.DataFrame(lg.params, columns = ['coef'])
lgcoef['Odds_ratio'] = lgcoef['coef'].apply(lambda x: np.exp(x)) # odds ratio is exp of the coefficient
lgcoef
# Convert odds ratio to probability: p = OR / (1 + OR)
lgcoef['probability'] = lgcoef['Odds_ratio']/(1+lgcoef['Odds_ratio'])
lgcoef
# Baseline scikit-learn logistic regression with default hyper-parameters.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
# (Removed a pasted cell-output repr of LogisticRegression(...): it was a
#  no-op bare expression and used deprecated 'warn' sentinel values for
#  multi_class/solver.)
y_pred = logreg.predict(x_test)
from sklearn.metrics import confusion_matrix, recall_score,precision_score,f1_score,roc_auc_score,accuracy_score
def draw_cm(actual, predicted):
    """Plot the confusion matrix of actual vs. predicted labels as a heatmap.

    Fixes: restored the function-body indentation (flattened in the paste)
    and use fmt='d' since confusion-matrix cells are integer counts, not
    floats ('.2f' rendered e.g. 850 as 850.00).
    """
    cm = confusion_matrix(actual, predicted)
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('observed')
    plt.xlabel('Predicted')
    plt.show()
# Accuracy on train / test, confusion matrix and the full metric suite.
logreg.score(x_train,y_train)
logreg.score(x_test,y_test)
draw_cm(y_test,y_pred)
print('Accuracy on train set: {:.2f}'.format(logreg.score(x_train, y_train)))
print('Accuracy on test set: {:.2f}'.format(logreg.score(x_test, y_test)))
# Because of the class imbalance, recall/precision/AUC/F1 matter more
# than raw accuracy here.
recall_score(y_test,y_pred)
precision_score(y_test,y_pred)
roc_auc_score(y_test,y_pred)
f1_score(y_test,y_pred)
# Summary of logistic regression.
# Fixed: the result was previously assigned to the name `confusion_matrix`,
# shadowing the imported sklearn function and breaking any later call to it.
cm_matrix = confusion_matrix(y_test, y_pred)
print(cm_matrix)
print(classification_report(y_test, y_pred))
#!pip install yellowbrick
# Visual classification report and ROC/AUC curve via yellowbrick
from yellowbrick.classifier import ClassificationReport, ROCAUC
viz = ClassificationReport(LogisticRegression(random_state=42))
viz.fit(x_train,y_train)
viz.score(x_test,y_test)
viz.show()
roc = ROCAUC(LogisticRegression(random_state=42))
roc.fit(x_train,y_train)
roc.score(x_test,y_test)
roc.show()
# Current hyper-parameters of the baseline model
logreg.get_params()
# Hyper-parameter tuning: 5-fold grid search over solver and C, with
# class_weight='balanced' to compensate for the target-class imbalance.
from sklearn.model_selection import GridSearchCV
param_grid = [{'solver': ['newton-cg','lbfgs','liblinear','sag','saga'],
               'C': [0.001,0.01,0.1,0.25,0.5,0.75,1],
               'class_weight':['balanced'], 'penalty':['l2']}]
grid_search = GridSearchCV(LogisticRegression(),param_grid,cv=5, verbose=0)
grid_search.fit(x_train,y_train)
grid_search.best_estimator_
# Use the estimator the search actually found, instead of a hand-copied,
# hard-coded parameter list that silently goes stale when the data or
# grid changes.
model = grid_search.best_estimator_
model.fit(x_train,y_train)
predictions = model.predict(x_test)
# Visual classification report for the tuned model
viz = ClassificationReport(model)
viz.fit(x_train,y_train)
viz.score(x_test,y_test)
viz.show()
# Experiment: same tuned hyper-parameters but class_weight=None, to see the
# effect of NOT re-weighting the highly imbalanced target classes.
model = LogisticRegression(C=0.1, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)
model.fit(x_train,y_train)
predictions = model.predict(x_test)
# Visual classification report for the unweighted model
viz = ClassificationReport(model)
viz.fit(x_train,y_train)
viz.score(x_test,y_test)
viz.show()
# ROC/AUC curve for the unweighted model
roc = ROCAUC(model)
roc.fit(x_train,y_train)
roc.score(x_test,y_test)
roc.show()
We should train the model with a sufficiently large data set to avoid underfit/overfit issues.
The target variable is highly imbalanced — the classes are not equally weighted — which means we cannot rely on the accuracy score alone.
We'll need more balanced data for the model to perform better.
Vary the solver of LogisticRegression: for small datasets 'liblinear' is a good choice, whereas 'sag' and 'saga' are faster for large ones.
Provide a correct class_weight parameter to LogisticRegression: a higher weight on fields like income, which have high correlation with the target variable, will yield better results.
We can find the best_estimator_ and re-apply it to the logistic regression model.
# Recap: statistical inference — coefficient summary, odds ratios, probabilities
lg.summary()
lgcoef
# Final classification report of the baseline model
print(classification_report(y_test, y_pred))